# This script utilizes tree-based methods to segment
# customers based on their engagement and participation variables.
# Load the packages
packList <- c("dplyr", "ggplot2", "randomForest", "rpart", "rpart.plot", "caret", "lubridate", "tidyr")
if (length(setdiff(packList, rownames(installed.packages()))) > 0) {
install.packages(setdiff(packList, rownames(installed.packages())))
}
sapply(packList, library, character.only = TRUE, verbose = FALSE) # load packages for analysis
d <- read.table("./SourceData/FRESNO_DEMOGRAPHICS_SEGMENT.txt", strip.white = TRUE ,
sep = "\t", stringsAsFactors = FALSE, fill = FALSE, header = TRUE)
du <- read.csv("./SourceData/FRESNO_USAGE_TEMP.txt",
stringsAsFactors = FALSE, header = TRUE, strip.white = TRUE )
# Tidy Demographic and Usage Data ---------------------------------------------------------------
# only unique TENUR_sp_id
d <- d %>%
# distinct(TENUR_sp_id) %>%
filter(sampleGroup %in% c("1","2")) # sample group 1 and 2
# create dummy variables - PROGR_elec_enduse_cd
da$num <- 1
ee_spread <- spread(data = da[c('PROGR_elec_enduse_cd','num')],
key = PROGR_elec_enduse_cd, value = num, fill = 0)[2:5]
head(ee_spread)
colnames(ee_spread) <- sapply(colnames(ee_spread),
function(x) paste0("PROGR_elec_enduse_cd_",x, sep = ""))
dim(ee_spread)
rs_spread <- spread(data = da[c('RATES_cmprs_rt_sched_cd','num')],
key = RATES_cmprs_rt_sched_cd, value = num, fill = 0)[2:19]
head(rs_spread)
colnames(rs_spread) <- sapply(colnames(rs_spread),
function(x) paste0("RATES_cmprs_rt_sched_cd_",x, sep = ""))
ee_spread[,1:4] <- sapply(ee_spread[,1:4], as.factor)
sapply(ee_spread, class)
da <- cbind(d, ee_spread, rs_spread)
d <- subset(d, select = -c(num))
dateListD <- colnames(select(d, contains("_dt"), contains("_date"))) # date for demo
dateListU <- colnames(select(du, contains("_dt"), contains("_date"))) # date for usage
# ====== IDENTIFY THE TIMES OF DAY FOR GROUPING =========
# apply as.date to date files
du <- du[du$sampleGroup %in% c("1","2"),] #sample group 1 and 2
du[dateListU] <- lapply(du[dateListU], function(x) as.Date(x, format = "%m/%d/%Y"))
du$day <- weekdays(du$usage_date, abbreviate = FALSE)
du$weekday <- ifelse(du$day %in% c("Saturday", "Sunday"), "Weekend", "Weekday")
du$yr <- format(du$usage_date,"%Y")
# Split usage data by the year
du_train <- du[(du$yr == "2013"),]
du_test <- du[(du$yr == "2014"),]
du_time_day <- du_train %>%
group_by(weekday, sp_id, usage_hour) %>%
summarize(
usage = median(usage)
) %>% mutate(
usage = (usage - mean(usage)) / sd(usage)
)
# Plot the usage against time of day
ggplot(du_time_day, aes(usage_hour, usage)) +
geom_point(position = "jitter", alpha = 1/4) +
geom_smooth(size = 1.1, se = FALSE) +
annotate("rect", xmin = -0.5, xmax = 6, ymin = -2.5, ymax = 2.8, alpha = 0.2) +
annotate("rect", xmin = 16, xmax = 23.5, ymin = -2.5, ymax = 2.8, alpha = 0.2) +
facet_wrap(~weekday) +
ggtitle("Average Usage Pattern") +
xlab("Hour of the Day") +
ylab("Standardized Usage")
# Choosing breaks at 6am and 4pm
du$hour_group <- cut(x = du$usage_hour, breaks = c(-1,6,16,24),
labels = c("0-6","6-16","16-24"))
# ====== CREATE AND VISUALIZE DECISION TREES =========
# Create aggregate for each hourly group and user
da <- du %>%
group_by(yr, hour_group, TENUR_sp_id = sp_id) %>%
summarize(
usage = mean(usage),
temp = mean(mean_temp)
)
# merge with in the demographic data
# da <- merge(da, v, by = "sp_id", all.x = TRUE, all.y = FALSE)
da <- left_join(da, d, by = "TENUR_sp_id")
# # code dwelling type 0 or 1
# da_wide <- spread(data = demoData_wide, DWELL_dwg_typ_cd, num, fill = 0)
# head(demoData_wide)
# da$DWELL_dwg_typ_cd
# lapply(da[dwell_List], function(x) summary(factor(x)))
# has child as 0 or 1
da$has_chld <- ifelse(da$CHILD_hsehd_chld_cnt != 0, 1, 0)
# code pool flasg as 0 or 1
da$HOME_swimming_pool_flg <- binary_Lut[da$HOME_swimming_pool_flg]
# Create attribute lists for joined information
dg <- ungroup(da)
colnames(dg)
ageList <- colnames(select(da[-c(1,2)], starts_with("AGE")))
annualList <- colnames(select(da, starts_with("ANNU")))
toNum <- which(colnames(da) %in% annualList)
da[,toNum] <- sapply(da[,toNum], as.numeric)
sapply(da[,toNum], class)
childList <- colnames(select(da[-c(1,2)], starts_with("CHILD")))
dwell_List <- colnames(select(da[-c(1,2)], starts_with("DWELL")))
engagList <- colnames(select(da[-c(1,2)], starts_with("ENGAG")))
fuel_List <- colnames(select(da, starts_with("FUEL")))
fuelNum <- which(colnames)
home_List <- colnames(select(da[-c(1,2)], starts_with("HOME")))
numbeList <- colnames(select(da[-c(1,2)], starts_with("NUMB")))
premiList <- colnames(select(da[-c(1,2)], starts_with("PREM")))
progrList <- colnames(select(da[-c(1,2)], starts_with("PROG")))
rateList <- colnames(select(da[-c(1,2)], starts_with("RATE")))
residList <- colnames(select(da[-c(1,2)], starts_with("RESID")))
tenurList <- colnames(select(da[-c(1,2)], starts_with("TENUR")))
year_List <- colnames(select(da[-c(1,2)], starts_with("YEAR")))
to <- which(colnames(da)%in%engagList)
class(da[,toAge[2]])
class(da[,toAge[1]])
class(da$AGE_G_adlt_age)
numericLIst <- colnames(select(da, ends_with("age"), ends_with("nbr"), ends_with("cnt")))
factorList <- colnames(select(da, matches("cd"), matches("desc")
, contains("_id"), contains("hour_group"),
ends_with("_ind")))
toNumeric <- which(colnames(da)%in% c(numericLIst, fuel_List, numbeList))
da[,toNumeric] <- sapply(da[,toNumeric], as.numeric)
sapply(da[,toNumeric], class)
catUn
# create binary look up tables
factor_Lut <- c("H" = 1, "C" = 1, "R" = 0, "U" = 0, " " = 0) # use lookup table for binary variable (homeowner O,1)
binary_Lut <- c("Y" = 1, "U" = 0, "N" = "0", " " = 0) #
binary_Lut <- c("Y" = 1, "N" = 0)
#
missing_Lut <- c("?" = " " )
# variables to factors
da[,factorList] <- sapply(da[factorList], as.factor)
summary(da[factorList])
da$DWELL_dwg_typ_cd
# engagment binary
# da$ENGAG_pty_email <- factor(ifelse(test = da$ENGAG_pty_email == "?", 0,1))
da[,engagList] <- sapply(da[engagList], function(x) ifelse(x == "?", "N",
ifelse(x == "N", "N", "Y")))
da[,engagList] <- sapply(da[engagList], function(x) ifelse(x == "0", 0, 1))
summary(da[,engagList])
sapply
da[engagList]
da['ENGAG_pty_email'] <- ifelse(da['ENGAG_pty_email'] == "?", "N", "Y")
sapply(da[engagList], function(x) summary(factor(x)))
# program variables
sapply(da[progrList], function(x) summary(factor(x)))
# Rate variables
sapply(da[rateList], function(x) summary(factor(x)))
# home variables
home_List
da$HOME_hm_ac_typ_dsc <- ifelse(da$HOME_hm_ac_typ_dsc == ".", "Other", da$HOME_hm_ac_typ_dsc)
da[c('HOME_hm_ac_typ_dsc','HOME_hm_ac_typ_dsc')]
sapply(da[home_List], function(x) round(prop.table(table((x))),digits = 2))
# Format variables (Warning messages are OK)
da$segment <- 100
# Split data again by the year
d_train <- da[(da$yr == 2013),]
d_test <- da[(da$yr == 2014),]
# Select columns for inclusion
dput(colnames(da[progrList]))
dput(colnames(da[rateList]))
dput(colnames(da[engagList]))
dput(PROG_INCLUDE)
PROG_INCLUDE <- c(
# "PROGR_elec_enduse_cd"
"PROGR_emp_ind"
, "PROGR_esap_typ_cd"
# , "PROGR_mtr_rdg_rte_cd"
# , "PROGR_net_mtr_ind"
# , "PROGR_sm_endDate"
# , "PROGR_sm_opt_out_end_dt"
# , "PROGR_sm_opt_out_sta_cd"
# , "PROGR_sm_opt_out_start_dt"
, "PROGR_sm_pgm_cd"
# , "PROGR_sm_startDate"
, "PROGR_smart_ac_ind"
, "PROGR_solar_ind"
, "PROGR_elec_enduse_cd_H"
, "PROGR_elec_enduse_cd_M"
, "PROGR_elec_enduse_cd_S"
)
RATE_INCLUDE <- c(
# "RATE_nem_typ_cd"
# , "RATE_revn_acct_cd"
# , "RATE_sa_typ_cd"
# "RATES_cmprs_rt_sched_cd"
# , "RATES_deriv_rt_sched_cd"
# , "RATES_deriv_usg_prfl_cd"
# , "RATES_svc_typ_cd"
"RATES_cmprs_rt_sched_cd_E1L"
, "RATES_cmprs_rt_sched_cd_E6"
, "RATES_cmprs_rt_sched_cd_E7"
, "RATES_cmprs_rt_sched_cd_E7L"
, "RATES_cmprs_rt_sched_cd_E8"
, "RATES_cmprs_rt_sched_cd_E8L"
, "RATES_cmprs_rt_sched_cd_EVB"
)
keepList <- c("AGE_G_adlt_age","ANNUA_avg_scorex_plus_nbr",
"ANNUA_estmtd_hsehd_incm", "CHILD_hsehd_chld_cnt",
"ENGAG_care_ind", "ENGAG_fera_ind", "ENGAG_my_ener_enrl_ind",
"ENGAG_pay_plan_ind", "ENGAG_pty_email", "FUEL_coal_heat_pct",
"FUEL_elec_heat_pct", "FUEL_kerosene_oil_heat_pct",
"FUEL_lp_gas_heat_pct", "FUEL_othr_heat_pct", "FUEL_solar_heat_pct",
"FUEL_util_gas_heat_pct", "FUEL_wood_heat_pct", "HOME_hm_ac_typ_cd",
"HOME_hm_ac_typ_dsc", "HOME_hm_htg_typ_cd", "HOME_hm_htg_typ_dsc",
"HOME_swimming_pool_flg", "HOME_bathrooms_nbr", "HOME_bedrooms_nbr",
"HOME_hm_bas_sq_ft_nbr", "HOME_rooms_nbr", "HOME_stories_nbr",
"NUMBE_hsehd_adlt_cnt", "PREMI_Built_Medn_Yr", "PREMI_cec_climate_zone_cd", "PREMI_cec_climate_zone_desc",
"PREMI_census_blk_grp", "PREMI_cty", "PREMI_Hh_Cnt", "PREMI_Hsg_Unit_Cnt",
"PREMI_Hsg_Unit_Ocpy_Cnt", "PREMI_Hsg_Unit_Owner_Ocpy_Cnt", "PREMI_Hsg_Unit_Rntr_Ocpy_Cnt",
"PREMI_Hsg_Unit_Vcnt_Cnt", "PREMI_Incm_Hh_Medn_Amt", "PREMI_Incm_Hh_Tot_Amt",
"PREMI_Incm_Per_Cap_Amt", "PREMI_opr_area_cd", "PREMI_Val_100K_149K_Cnt",
"PREMI_Val_150K_199K_Cnt", "PREMI_Val_200K_299K_Cnt", "PREMI_Val_300K_499K_Cnt",
"PREMI_Val_500K_999K_Cnt", "PREMI_Val_50K_99K_Cnt", "PREMI_Val_Over_1M_Cnt",
"PREMI_Val_Und_50K_Cnt", "PREMI_wea_stn_cd", "PREMI_zip_cd_12",
"PROGR_elec_enduse_cd", "PROGR_esap_typ_cd", "PROGR_smart_ac_ind",
"PROGR_solar_ind", "RATE_sa_typ_cd", "RATES_cmprs_rt_sched_cd",
"RATES_deriv_rt_sched_cd", "RATES_deriv_usg_prfl_cd", "RESID_homeowner_ind",
"TENUR_sp_id", "TENUR_acct_id", "TENUR_duration", "TENUR_prem_id",
"TENUR_sa_id", "YEAR_hse_age", "YEAR_yr_built")
c("ageList", "annualList", "childList", "dwell_List", "engagList",
"fuel_List", "home_List", "listList", "numbeList", "premiList",
"progrList", "rateList", "residList", "tenurList", "year_List"
)
sapply(da[progrList], function(x) summary(factor(x)))
summary(da$PREMI_Built_Medn_Yr)
str(listList)
cols <- c("usage", engagList, progrList[progrList%in%PROG_INCLUDE],
rateList[rateList%in%RATE_INCLUDE])
# check summary of without usage
lapply(da[cols][-1], function(x) round(prop.table(table((x))),digits = 3))
lapply(da[home_List], function(x) round(prop.table(table((x))),digits = 2))
lab <- c("0-6", "6-16", "16-24")
ct_list <- list()
for (i in 1:3){
# Set subsetting condition
cond_train <- d_train$hour_group == lab[i]
cond_test <- d_test$hour_group == lab[i]
# Fit Tree
ct <- rpart(usage ~., d_train[cond_train, cols],
control = rpart.control(maxdepth = 4,
minbucket = 15))
# Save the trees in a list
ct_list[[i+1]] <- ct
# Assign group membership
ct$frame$target <- ct$frame$yval
ct$frame$yval <- as.numeric(row.names(ct$frame))
d_train[cond_train,]$segment <- predict(ct, d_train[cond_train,])
d_test[cond_test,]$segment <- predict(ct, d_test[cond_test,cols])
}
# Plot The Trees
for (i in 1:3){
prp(ct_list[[i+1]], extra = 1, type = 2, left = FALSE, varlen = 0, nn=TRUE)
title(lab[i], cex=.3, line = 0, adj = 0)
}
# ====== PLOT USAGE VS. TEMPERATURE FOR EACH GROUPING =========
# Create dataframe with sp_id to usage_hour-segment groupings
colnames(d_train)
segments <- unique(d_train[,c("TENUR_sp_id", "hour_group", "segment")])
# segments <- read.csv(paste0(fl, "tree_segments.csv"))
write.csv(segments, "tree_segments.csv", row.names = FALSE)
# Assign each data point a segment
d <- merge(d, segments , by = c("sp_id", "hour_group"), all = TRUE)
# Split data again by the year
d_train <- d[(d$yr == 2013),]
d_test <- d[(d$yr == 2014),]
# Create predictions for each segment and hour group
fit <- dlply(d_train, c("hour_group", "segment"), function(x)
earth(usage ~ mean_temp, data = x, Scale.y = FALSE))
d_test <- ddply(d_test, c("hour_group", "segment"), function(x)
transform(x, yHat = predict(fit[[paste(x[1,c("hour_group", "segment")], collapse=".")]],
newdata = x)))
# Create charts for the estimated lines for each group
ggplot(d_test, aes(mean_temp,
usage.1,
group = factor(segment),
colour =factor(hour_group))) +
geom_line(size = .75) +
facet_wrap(~usage_hour, ncol = 4) +
scale_y_continuous("Predicted Electricity Usage") +
scale_x_continuous("Temperature") +
theme(legend.title=element_blank()) +
ggtitle("Estimated Electricity Usage and Temperature By Customer Segment And Time Of Day")
# ====== COMPUTE RF IMPORTANCE AND PLOT BY HOUR OF DAY ========
lab <- c("0-6", "6-16", "16-24")
scores_rf <- data.frame()
for (j in 1:3){
cond <- (da$hour_group == lab[j])
rf <- randomForest(usage ~., da[cond, cols], na.action = na.omit)
tmp <- data.frame(imp = unname(rf$importance),
hour_group = lab[j],
vars = row.names(rf$importance))
scores_rf <- rbind(scores_rf, tmp)
}
da$PROGR_elec_enduse_cd_S <- as.factor(da$PROGR_elec_enduse_cd_S)
summary(da[cond, cols])
da[progrList] <- as.factor(da[progrList][1:7])
scores_rf$
# Format the hour group variable
scores_rf$hour_group <- factor(scores_rf$hour_group,
levels = c("0-6", "6-16", "16-24"),
labels = c("Morning", "Day", "Evening"))
# Visualize Importance Scores
ggplot(scores_rf, aes(imp, vars, color = hour_group)) +
geom_point(size = 4) +
xlab("Importance Scores") +
ylab("Variables") +
geom_line(aes(group = hour_group), stat="vline", xintercept="mean", size = 0.9) +
# facet_grid(~year) +
ggtitle("Variable Importance Scores for Random Forest")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.